suppressPackageStartupMessages({
library(SummarizedExperiment)
library(SEtools)
library(ggplot2)
library(plgINS)
library(edgeR)
})# load splice SE (the individual HeLa & HEK datasets are only used for colData generation)
se.splicing <- readRDS("data/raw/bartel.splicing.raw.SE.rds")
se.hela <- readRDS( "data/bartel.hela.DEA.SE.rds")
se.hek <- readRDS("data/bartel.hek.DEA.SE.rds")# create colData info for SE
colData(se.splicing) <- cbind(colData(se.splicing), rbind(colData(se.hela)[se.hela$miRNA!="CTRL",], colData(se.hek)[se.hek$miRNA!="CTRL",])[colnames(se.splicing),])
# separate the 2 cell lines
se.hela <- se.splicing[,se.splicing$Cell_Line=="HeLa"]
se.hek <- se.splicing[,se.splicing$Cell_Line!="HeLa"]
# filter
sel <- c(10, 2)
se.hela <- se.hela[rowSums(assay(se.hela) >= sel[1]) >= sel[2],]
se.hek <- se.hek[rowSums(assay(se.hek) >= sel[1]) >= sel[2],]# supplementary function integrated into the DEAprep() function below:
# generates control samples out of the average values over all supplied samples
genCTRL <- function(se, readtype, logcpm){
# e.ctrl <- sapply(unique(se$Batch), ls=min(colSums(assays(se)[[readtype]])),
# FUN=function(x,ls){
# rowMedians(exp(assays(se)[[logcpm]][,se$Batch==x])-1) * ls/1000000
# }
# )
e.ctrl <- sapply(unique(se$Batch), ls=colSums(assays(se)[[readtype]]),
FUN=function(x,ls){
rowMedians(expm1(assays(se)[[logcpm]][,se$Batch==x])) * median(ls[se$Batch==x])/1000000
}
)
### generate colnames
n.cols <- sapply( unique(se$Batch), FUN=function(x) var_name <- paste("CTRL", x, sep=".") )
colnames(e.ctrl) <- n.cols
rownames(e.ctrl) <- rownames(se)
return(e.ctrl)
}#' DEAprep
#'
#' Generates control samples out of the average values over all supplied samples, combines spliced and unspliced
#' assays and calculates their logCPM & log2FC data.
#' All is assembled into one SE object.
#'
#'
#' @param se SummarizedExperiment object containing assays of raw counts of spliced & unspliced tx
#'
#' @return an SE object of combined spliced and unspliced data as well as control samples generated from averaging over all treatment samples
#'
DEAprep <- function(se){
# allocation
e1 <- assays(se)$spliced
e2 <- assays(se)$unspliced
readtype1 <- "spliced"
readtype2 <- "unspliced"
# generate control
## create logcpm assays for both spliced & unspliced assays
assays(se)$logcpm.spliced <- log1p(cpm(calcNormFactors(DGEList(assays(se)$spliced))))
assays(se)$logcpm.unspliced <- log1p(cpm(calcNormFactors(DGEList(assays(se)$unspliced))))
## generate a 'control' sample out of the median normalized counts over all samples
e.ctrl1 <- genCTRL(se, readtype1, "logcpm.spliced")
e.ctrl2 <- genCTRL(se, readtype2, "logcpm.unspliced")
## add control samples to assays & generate DGEList object
dds1 <- DGEList(cbind(e1, e.ctrl1))
dds2 <- DGEList(cbind(e2, e.ctrl2))
## combine the spliced & unspliced assays
dds <- cbind(dds1, dds2)
# generate colData
dd1 <- rbind(colData(se)[,c("Batch","miRNA","Cell_Line")],
data.frame(Batch=unique(se$Batch), miRNA="CTRL", Cell_Line=unique(se$Cell_Line)))
dd1$readtype <- readtype1
## duplicate dd to have data for combined spliced & unspliced assay
dd <- rbind(dd1, dd1)
dd$readtype[(nrow(dd1)+1):nrow(dd)] <- readtype2
dd$readtype <- as.factor(dd$readtype)
## rename both dds & dd object features
n.cols1 <- sapply(colnames(dds1), FUN=function(x)
var_name <- paste(x, readtype1, sep=".") )
n.cols2 <- sapply(colnames(dds2), FUN=function(x)
var_name <- paste(x, readtype2, sep=".") )
colnames(dds) <- c(n.cols1, n.cols2)
rownames(dd) <- c(n.cols1, n.cols2)
# rebuild SE object
## SE object with logCPM assay
se <- SummarizedExperiment(assays=list(counts=dds$counts),
rowData=rowData(se),
colData=dd)
assays(se)$logcpm <- log1p(cpm(calcNormFactors(DGEList(assay(se)))))
return(se)
}# plot PCA to check for batch effect HeLa
## combined HeLa
plgINS::plPCA(assays(se.hela)$logcpm, as.data.frame(colData(se.hela)), colorBy = "Batch", shapeBy = "readtype",
add.labels = FALSE, annotation_columns = colnames(colData(se.hela))[c(1,2,4)])# plot PCA to check for batch effect HEK
## combined HEK
plgINS::plPCA(assays(se.hek)$logcpm, as.data.frame(colData(se.hek)), colorBy = "Batch", shapeBy = "readtype",
add.labels = FALSE, annotation_columns = colnames(colData(se.hek))[c(1,2,4)])#' exonDEA
#'
#' @param se SE object containing assays with combined spliced & unspliced counts
#' @param model function: model matrix function to test
#' @param model0 function: model matrix function to be tested against
#' @param control string: name of control samples inside colData() of supplied SE object
#'
#' @return SE object containing DEA results over all treatments & over individual ones
#'
exonDEA <- function(se, model, model0=~1, control){
## allocation
se$miRNA <- relevel(droplevels(se$miRNA), ref=control)
se$readtype <- relevel(se$readtype, ref="unspliced")
se$Batch <- droplevels(se$Batch)
dd <- colData(se)
## normalization
dds <- calcNormFactors(DGEList(assays(se)$counts))
## models
mm <- model.matrix(model, data=dd)
mm0 <- model.matrix(model0, data=dd)
testCoeff <- setdiff(colnames(mm), colnames(mm0))
## estimate dispersion
dds <- estimateDisp(dds,mm)
## fit negative binomial distribution on counts per gene (use glmFit for few replicates)
fit <- glmFit(dds, mm)
## fit a GLM on the data
lrt.comb <- glmLRT(fit, testCoeff)
## top genes that change relative to stage 0
res.comb <- as.data.frame(topTags(lrt.comb, Inf))
## fit linear model dropping one sample at a time (using multiple cores)
res.list <- bplapply( testCoeff, BPPARAM=MulticoreParam(8), FUN=function(x){
as.data.frame(topTags(glmLRT(fit, x),Inf))
})
dea.names <- gsub("readtype","", testCoeff)
dea.names <- make.names(gsub(":miRNA",".", dea.names))
colnames(res.comb)[grepl("logFC",colnames(res.comb))] <- paste0("logFC.", dea.names)
names(res.list) <- paste0("DEA.",dea.names)
## add DEAs
rowData(se)$DEA.spliced.all <- DataFrame(res.comb[rownames(se),])
for(i in paste0("DEA.",dea.names)){
rowData(se)[[i]] <- DataFrame(res.list[[i]][rownames(se),])
}
## add logFC assay
se <- plgINS::svacor(se, form = model, form0 = model0)
se <- SEtools::log2FC(se, controls = se$miRNA==control, by = se$readtype, fromAssay = "corrected", isLog = TRUE)
return(se)
}# bartel hela & hek DEA with exon resolution
## combined readtype:miRNA effect
se.hela.comb <- exonDEA(se.hela, model = ~readtype*miRNA, model0 = ~readtype+miRNA, control="CTRL")## Number of significant surrogate variables is: 6
## Iteration (out of 5 ):1 2 3 4 5
## DEA.spliced.all DEA.spliced.let.7a DEA.spliced.lsy.6
## 1308 32 44
## DEA.spliced.miR.1 DEA.spliced.miR.124 DEA.spliced.miR.137
## 242 293 115
## DEA.spliced.miR.139 DEA.spliced.miR.143 DEA.spliced.miR.144
## 4 5 0
## DEA.spliced.miR.153 DEA.spliced.miR.155 DEA.spliced.miR.182
## 15 77 77
## DEA.spliced.miR.199a DEA.spliced.miR.204 DEA.spliced.miR.205
## 62 19 14
## DEA.spliced.miR.216b DEA.spliced.miR.223 DEA.spliced.miR.7
## 19 58 181
## Number of significant surrogate variables is: 6
## Iteration (out of 5 ):1 2 3 4 5
## DEA.spliced.all DEA.spliced.miR.122 DEA.spliced.miR.133
## 1814 36 78
## DEA.spliced.miR.138 DEA.spliced.miR.145 DEA.spliced.miR.184
## 177 85 10
## DEA.spliced.miR.190a DEA.spliced.miR.200b DEA.spliced.miR.216a
## 99 152 25
## DEA.spliced.miR.217 DEA.spliced.miR.219a DEA.spliced.miR.375
## 38 79 107
## DEA.spliced.miR.451a
## 21
# bartel hela & hek DEA with exon resolution
## batch effect included
se.hela.batch <- exonDEA(se.hela, model = ~Batch+readtype*miRNA, model0 = ~Batch+readtype+miRNA, "CTRL")## Number of significant surrogate variables is: 7
## Iteration (out of 5 ):1 2 3 4 5
## DEA.spliced.all DEA.spliced.let.7a DEA.spliced.lsy.6
## 6347 433 244
## DEA.spliced.miR.1 DEA.spliced.miR.124 DEA.spliced.miR.137
## 1441 1286 581
## DEA.spliced.miR.139 DEA.spliced.miR.143 DEA.spliced.miR.144
## 38 64 27
## DEA.spliced.miR.153 DEA.spliced.miR.155 DEA.spliced.miR.182
## 175 369 304
## DEA.spliced.miR.199a DEA.spliced.miR.204 DEA.spliced.miR.205
## 231 101 73
## DEA.spliced.miR.216b DEA.spliced.miR.223 DEA.spliced.miR.7
## 183 306 602
## Number of significant surrogate variables is: 5
## Iteration (out of 5 ):1 2 3 4 5
## DEA.spliced.all DEA.spliced.miR.122 DEA.spliced.miR.133
## 2746 58 134
## DEA.spliced.miR.138 DEA.spliced.miR.145 DEA.spliced.miR.184
## 247 152 23
## DEA.spliced.miR.190a DEA.spliced.miR.200b DEA.spliced.miR.216a
## 148 210 50
## DEA.spliced.miR.217 DEA.spliced.miR.219a DEA.spliced.miR.375
## 79 106 162
## DEA.spliced.miR.451a
## 34
# bartel hela & hek DEA with exon resolution
## miRNA effect
se.hela.test <- exonDEA(se.hela, model = ~readtype+miRNA, model0 = ~readtype, "CTRL")## Number of significant surrogate variables is: 9
## Iteration (out of 5 ):1 2 3 4 5
## DEA.spliced.all DEA.miRNAlet.7a DEA.miRNAlsy.6 DEA.miRNAmiR.1
## 10952 926 129 2585
## DEA.miRNAmiR.124 DEA.miRNAmiR.137 DEA.miRNAmiR.139 DEA.miRNAmiR.143
## 2139 1923 4 9
## DEA.miRNAmiR.144 DEA.miRNAmiR.153 DEA.miRNAmiR.155 DEA.miRNAmiR.182
## 23 770 222 134
## DEA.miRNAmiR.199a DEA.miRNAmiR.204 DEA.miRNAmiR.205 DEA.miRNAmiR.216b
## 376 103 120 85
## DEA.miRNAmiR.223 DEA.miRNAmiR.7
## 238 764
## Number of significant surrogate variables is: 11
## Iteration (out of 5 ):1 2 3 4 5
## DEA.spliced.all DEA.miRNAmiR.122 DEA.miRNAmiR.133 DEA.miRNAmiR.138
## 11571 246 212 3030
## DEA.miRNAmiR.145 DEA.miRNAmiR.184 DEA.miRNAmiR.190a DEA.miRNAmiR.200b
## 345 15 452 701
## DEA.miRNAmiR.216a DEA.miRNAmiR.217 DEA.miRNAmiR.219a DEA.miRNAmiR.375
## 109 246 207 533
## DEA.miRNAmiR.451a
## 21
# bartel hela & hek DEA with exon resolution
## miRNA & combined exon:miRNA effect
se.hela.test2 <- exonDEA(se.hela, model = ~readtype*miRNA, model0 = ~readtype, "CTRL")## Number of significant surrogate variables is: 6
## Iteration (out of 5 ):1 2 3 4 5
## DEA.spliced.all DEA.miRNAlet.7a DEA.miRNAlsy.6
## 7789 37 2
## DEA.miRNAmiR.1 DEA.miRNAmiR.124 DEA.miRNAmiR.137
## 484 239 206
## DEA.miRNAmiR.139 DEA.miRNAmiR.143 DEA.miRNAmiR.144
## 0 0 0
## DEA.miRNAmiR.153 DEA.miRNAmiR.155 DEA.miRNAmiR.182
## 130 11 10
## DEA.miRNAmiR.199a DEA.miRNAmiR.204 DEA.miRNAmiR.205
## 35 7 1
## DEA.miRNAmiR.216b DEA.miRNAmiR.223 DEA.miRNAmiR.7
## 2 5 55
## DEA.spliced.let.7a DEA.spliced.lsy.6 DEA.spliced.miR.1
## 32 44 242
## DEA.spliced.miR.124 DEA.spliced.miR.137 DEA.spliced.miR.139
## 293 115 4
## DEA.spliced.miR.143 DEA.spliced.miR.144 DEA.spliced.miR.153
## 5 0 15
## DEA.spliced.miR.155 DEA.spliced.miR.182 DEA.spliced.miR.199a
## 77 77 62
## DEA.spliced.miR.204 DEA.spliced.miR.205 DEA.spliced.miR.216b
## 19 14 19
## DEA.spliced.miR.223 DEA.spliced.miR.7
## 58 181
## Number of significant surrogate variables is: 6
## Iteration (out of 5 ):1 2 3 4 5
## DEA.spliced.all DEA.miRNAmiR.122 DEA.miRNAmiR.133
## 9699 2 31
## DEA.miRNAmiR.138 DEA.miRNAmiR.145 DEA.miRNAmiR.184
## 738 12 3
## DEA.miRNAmiR.190a DEA.miRNAmiR.200b DEA.miRNAmiR.216a
## 16 55 3
## DEA.miRNAmiR.217 DEA.miRNAmiR.219a DEA.miRNAmiR.375
## 103 15 136
## DEA.miRNAmiR.451a DEA.spliced.miR.122 DEA.spliced.miR.133
## 0 36 78
## DEA.spliced.miR.138 DEA.spliced.miR.145 DEA.spliced.miR.184
## 177 85 10
## DEA.spliced.miR.190a DEA.spliced.miR.200b DEA.spliced.miR.216a
## 99 152 25
## DEA.spliced.miR.217 DEA.spliced.miR.219a DEA.spliced.miR.375
## 38 79 107
## DEA.spliced.miR.451a
## 21
# heatmap function
makeHM <- function(se, sig, nr=20, logcpm=FALSE){
for(i in sig){
if(sum(rowData(se)$DEA.spliced.all$FDR < i) > nr){
cat("FDR <", i, "\n")
# logFC heatmap
SEtools::sehm(se[,order(colData(se)$miRNA)], row.names(se)[rowData(se)$DEA.spliced.all$FDR < i], gaps_at = "readtype",
breaks=TRUE, do.scale = FALSE, show_colnames = FALSE, assayName = "log2FC", anno_columns = c("miRNA","readtype"))
# logcpm heatmap
se.sp <- se[,colData(se)$readtype=="spliced"]
if(logcpm){
SEtools::sehm(se.sp[,order(colData(se.sp)$miRNA)], row.names(se.sp)[rowData(se.sp)$DEA.spliced.all$FDR < i], gaps_at = "readtype",
breaks=TRUE, do.scale = TRUE, show_colnames = FALSE, assayName = "logcpm", anno_columns = c("miRNA","readtype"))
}
break
}
}
}## FDR < 1e-05
# batch [model = ~Batch+readtype*miRNA, model0 = ~Batch+readtype+miRNA]
makeHM(se.hela.batch, sig.lvl)## FDR < 1e-05
## FDR < 1e-05
## FDR < 1e-05
# logCPM vs logFC scatterplot HeLa
LSD:::heatscatter(as.vector(assays(se.hela.comb)$logcpm),as.vector(assays(se.hela.comb)$log2FC))
abline(a=0, b=0)## FDR < 1e-05
# batch [model = ~Batch+readtype*miRNA, model0 = ~Batch+readtype+miRNA]
makeHM(se.hek.batch, sig.lvl)## FDR < 1e-05
## FDR < 1e-05
## FDR < 1e-05
# logCPM vs logFC scatterplot HEK
LSD:::heatscatter(as.vector(assays(se.hek.comb)$logcpm),as.vector(assays(se.hek.comb)$log2FC))
abline(a=0, b=0)# spliced v unspliced HeLa
lib.hela.spliced <- apply(assays(se.hela.comb[,colData(se.hela.comb)$readtype=="spliced"])$counts, 2, function(x) sum(x))
lib.hela.unspliced <- apply(assays(se.hela.comb[,colData(se.hela.comb)$readtype=="unspliced"])$counts, 2, function(x) sum(x))
median(lib.hela.unspliced / lib.hela.spliced)## [1] 0.2186814
# spliced v unspliced HEK
lib.hek.spliced <- apply(assays(se.hek.comb[,colData(se.hek.comb)$readtype=="spliced"])$counts, 2, function(x) sum(x))
lib.hek.unspliced <- apply(assays(se.hek.comb[,colData(se.hek.comb)$readtype=="unspliced"])$counts, 2, function(x) sum(x))
median(lib.hek.unspliced / lib.hek.spliced)## [1] 0.1995575
# check number of positive & negative logFC for different significances
sigsDF <- function(se, sig, dea, thr){
data.frame(sigLevel=rep(sig,3),
counts=c(sapply(sig, function(x) sum(sign(assays(se[rowData(se)[[dea]]$FDR < x,])$log2FC) == -1 &
abs(assays(se[rowData(se)[[dea]]$FDR < x,])$log2FC) >= thr)),
sapply(sig, function(x) sum(sign(assays(se[rowData(se)[[dea]]$FDR < x,])$log2FC) == 0 &
abs(assays(se[rowData(se)[[dea]]$FDR < x,])$log2FC) >= thr)),
sapply(sig, function(x) sum(sign(assays(se[rowData(se)[[dea]]$FDR < x,])$log2FC) == 1 &
abs(assays(se[rowData(se)[[dea]]$FDR < x,])$log2FC) >= thr))
),
sign=c(rep("-1",length(sig)),
rep("0",length(sig)),
rep("1",length(sig))
),
dea=rep(dea,length(sig)*3)
)
}# for each model & DEA: find number of down- & upregulations at different significance levels
## significance levels of interest
sig <- c(1e-10, 1e-5, .05, .1, .5)
## only absolute log2FC greater than this will be considered
fc.thr <- .5
## create dataframes with count informations
### HeLa [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
sigs.hela <- lapply(grep("DEA",colnames(rowData(se.hela.comb)),value=TRUE), function(x){
sigsDF(se.hela.comb, sig, x, fc.thr)
})
sigs.hela <- data.frame(do.call("rbind",sigs.hela))
### HeLa [model = ~Batch+readtype*miRNA, model0 = ~Batch+readtype+miRNA]
sigs.hela.batch <- lapply(grep("DEA",colnames(rowData(se.hela.batch)),value=TRUE), function(x){
sigsDF(se.hela.batch, sig, x, fc.thr)
})
sigs.hela.batch <- data.frame(do.call("rbind",sigs.hela.batch))
### HEK [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
sigs.hek <- lapply(grep("DEA",colnames(rowData(se.hek.comb)),value=TRUE), function(x){
sigsDF(se.hek.comb, sig, x, fc.thr)
})
sigs.hek <- data.frame(do.call("rbind",sigs.hek))
### HEK [model = ~Batch+readtype*miRNA, model0 = ~Batch+readtype+miRNA]
sigs.hek.batch <- lapply(grep("DEA",colnames(rowData(se.hek.batch)),value=TRUE), function(x){
sigsDF(se.hek.batch, sig, x, fc.thr)
})
sigs.hek.batch <- data.frame(do.call("rbind",sigs.hek.batch))### [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
ggplot(sigs.hela[sigs.hela$dea=="DEA.spliced.all",], aes(x=as.factor(sigLevel), y=counts, fill=sign)) + geom_bar(position="dodge", stat="identity")### [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
ggplot(sigs.hela[sigs.hela$dea!="DEA.spliced.all",], aes(x=as.factor(sigLevel), y=counts, fill=sign)) + geom_bar(position="dodge", stat="identity") + facet_wrap(~dea)### [model = ~Batch+readtype*miRNA, model0 = ~Batch+readtype+miRNA]
ggplot(sigs.hela.batch[sigs.hela.batch$dea=="DEA.spliced.all",], aes(x=as.factor(sigLevel), y=counts, fill=sign)) + geom_bar(position="dodge", stat="identity")### [model = ~Batch+readtype*miRNA, model0 = ~Batch+readtype+miRNA]
ggplot(sigs.hela.batch[sigs.hela.batch$dea!="DEA.spliced.all",], aes(x=as.factor(sigLevel), y=counts, fill=sign)) + geom_bar(position="dodge", stat="identity") + facet_wrap(~dea)### HEK [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
ggplot(sigs.hek[sigs.hek$dea=="DEA.spliced.all",], aes(x=as.factor(sigLevel), y=counts, fill=sign)) + geom_bar(position="dodge", stat="identity")### HEK [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
ggplot(sigs.hek[sigs.hek$dea!="DEA.spliced.all",], aes(x=as.factor(sigLevel), y=counts, fill=sign)) + geom_bar(position="dodge", stat="identity") + facet_wrap(~dea)### HEK [model = ~Batch+readtype*miRNA, model0 = ~Batch+readtype+miRNA]
ggplot(sigs.hek.batch[sigs.hek.batch$dea=="DEA.spliced.all",], aes(x=as.factor(sigLevel), y=counts, fill=sign)) + geom_bar(position="dodge", stat="identity")### HEK [model = ~readtype*miRNA, model0 = ~readtype+miRNA]
ggplot(sigs.hek.batch[sigs.hek.batch$dea!="DEA.spliced.all",], aes(x=as.factor(sigLevel), y=counts, fill=sign)) + geom_bar(position="dodge", stat="identity") + facet_wrap(~dea)# load detailed Bartel treatment info containing exact sequences
pert <- read.csv("data/bartel_treatments.txt", sep = "\t")
# we extract HeLa & HEK data (we disregard passenger strands as long as we're working with TargetScan)
colnames(pert) <- c("treatment","seq")
pert <- list(pert[1:17,], pert[37:48,])
names(pert) <- c("hela","hek")
# find out where the seed sequence begins
#unlist(gregexpr(pattern ="GAGGUAG",pert$seq))
#unlist(gregexpr(pattern ="GGAAUGU",pert$seq))
# extract seed sequences
pert <- lapply(pert, function(x){
seed <- data.frame(seed=sapply(x$seq, function(y) substring(y ,2, 8)))
x <- cbind(x, seed)
})# Check if log2FC upregulations are due to skewed controls
ctrlAnalysis <- function(se, pert, TS, fc.degree){
## vector of Bartel miRNA treatments and their corresponding seeds (based on actual miRNA seqs)
pert.seed <- sapply(se$miRNA[se$miRNA!="CTRL" & se$readtype=="spliced"], function(x){
as.character(pert$seed[grepl(paste0(x,"$"), pert$treatment) | grepl(paste0(x,"\\("), pert$treatment)])
})
names(pert.seed) <- se$miRNA[se$miRNA!="CTRL" & se$readtype=="spliced"]
## adjusted gene names to be identical to TS names
fc.genes <- as.character(rowData(se)$gene_name)
## only consider common genes
id <- fc.genes %in% TS$feature
## significant ids
id.sig <- id & rowData(se)$DEA.spliced.all$FDR < .05
## aggregate logFC by genes
logfc <- cbind(as.data.frame(assays(se)$log2FC),fc.genes)
logfc <- aggregate(logfc[,colnames(logfc)!="fc.genes"], by=list(logfc$fc.genes), FUN="sum")
rownames(logfc) <- logfc$Group.1
logfc.sign <- logfc.sign.deg <- sign(logfc[,colnames(logfc)!="Group.1"])
## update gene names, DEA and IDs: aggregate by genes
### aggregate DEA FDR info
fdr <- rowData(se)$DEA.spliced.all
rownames(fdr) <- fc.genes
fdr <- aggregate(fdr[fc.genes[id],"FDR"], by=list(rownames(fdr[fc.genes[id],])), FUN="min")
rownames(fdr) <- fdr$Group.1
colnames(fdr) <- c("genes","FDR")
### adjusted gene names to be identical to TS names
fc.genes <- rownames(logfc)
### only consider common genes
id <- fc.genes %in% TS$feature
### significant ids
id.sig <- fc.genes %in% rownames(fdr)[fdr$FDR<.05]
## for splicing: only genes common with TS & with logfc createer than 'fc.degree' & significant FDR
logfc.sign.deg[logfc[,colnames(logfc)!="Group.1"] < fc.degree] <- 0
logfc.sign.deg <- logfc.sign.deg[rownames(logfc.sign.deg) %in% fc.genes[id.sig] & rowSums(logfc.sign.deg)>0, se$miRNA!="CTRL"]
## for control: only genes common with TS
logfc.sign <- logfc.sign[rownames(logfc.sign) %in% fc.genes[id], se$miRNA!="CTRL"]
## to find treatment miRNA for each gene for which the logFC are upregulated
nonUpregSeeds <- function(readtype, logfc, pert.seed){
l <- lapply(readtype, function(i){
apply(logfc[,readtype==i], 1, function(x){
pert.seed[x!=1]
})
})
return(l)
}
## splicing: find treatment miRNAs potentially targeting each gene
n <- nonUpregSeeds(unique(se$readtype), logfc.sign.deg, pert.seed)
names(n) <- unique(se$readtype)
## control: background distribution of upregulation instances for each gene
m <- nonUpregSeeds(unique(se$readtype), logfc.sign, pert.seed)
names(m) <- paste0("ctrl.",unique(se$readtype))
## combine
n <- c(n,m)
## for each common gene select the miRNA families that target them (using TargetScan data)
ts.all <- lapply(fc.genes[id], function(x){
TS$family[TS$feature==x]
})
names(ts.all) <- fc.genes[id]
ts.list <- list(splicing=ts.all[fc.genes[id.sig]], ctrl=ts.all[fc.genes[id]])
## ratio of treatment miRNA are in the TS target list
ratios <- lapply(n, function(x){
if( length(x)==nrow(logfc.sign) ){
sapply( names(x), function(y) sum(x[[y]] %in% ts.list$ctrl[[y]])/length(x[[y]]) )
} else {
sapply( names(x), function(y) sum(x[[y]] %in% ts.list$splicing[[y]])/length(x[[y]]) )
}
})
# output
return(list(n=n,ratios=ratios))
}# Check if log2FC upregulations are due to skewed controls
## call function
ca.hela <- ctrlAnalysis(se.hela.comb, pert$hela, TS, .5)
ca.hek <- ctrlAnalysis(se.hek.comb, pert$hek, TS, .5)# function to generate dataframes for plotting
dfGen <- function(ca){
df <- data.frame(
counts=unlist(lapply(ca$n, function(x) sapply(x, length))),
ratios=unlist(lapply(ca$ratios, function(x) x)),
id=unlist(lapply(names(ca$n), function(x) rep(x, length(ca$n[[x]])))),
readtype=c(rep("spliced",length(ca$n$spliced)),
rep("unspliced",length(ca$n$unspliced)),
rep("spliced",length(ca$n$ctrl.spliced)),
rep("unspliced",length(ca$n$ctrl.unspliced)) )
)
return(df)
}# plots HeLa
## to be able to affect the controls, the combined targeting of the treatment miRNAs must be in the majority
ggplot(df$hela, aes(counts)) + geom_density(aes(fill=id), alpha=0.9, na.rm=TRUE) + facet_wrap(~readtype) +
geom_vline(aes(xintercept = ncol(se.hela.comb[,se.hela.comb$miRNA!="CTRL" & se.hela.comb$readtype=="spliced"])/2), color="red")n.means <- sapply(unique(df$hela$id), function(x) mean(df$hela$counts[df$hela$id==x]))
names(n.means) <- unique(df$hela$id)
n.means## spliced unspliced ctrl.spliced ctrl.unspliced
## 32.37133 32.88256 16.89632 16.74532
n.medians <- sapply(unique(df$hela$id), function(x) median(df$hela$counts[df$hela$id==x]))
names(n.medians) <- unique(df$hela$id)
n.medians## spliced unspliced ctrl.spliced ctrl.unspliced
## 33 33 17 17
## check what ratio of treatment miRNA are in the TS target list (logit)
ggplot(df$hela, aes(log(ratios/(1 - ratios)))) + geom_density(aes(fill=id), alpha=0.5, na.rm=TRUE) + facet_wrap(~readtype)## check what ratio of treatment miRNA are in the TS target list
ggplot(df$hela, aes(ratios)) + geom_density(aes(fill=id), alpha=0.6, na.rm=TRUE) + facet_wrap(~readtype)## spliced unspliced ctrl.spliced ctrl.unspliced
## 0.12986543 0.12855534 0.06738572 0.06554005
## spliced unspliced ctrl.spliced ctrl.unspliced
## 0.1212121 0.1176471 0.0000000 0.0000000
# plots HEK
## to be able to affect the controls, the combined targeting of the treatment miRNAs must be in the majority
ggplot(df$hek, aes(counts)) + geom_density(aes(fill=id), alpha=0.9, na.rm=TRUE) + facet_wrap(~readtype) +
geom_vline(aes(xintercept = ncol(se.hek.comb[,se.hek.comb$miRNA!="CTRL" & se.hek.comb$readtype=="spliced"])/2), color="red")n.means <- sapply(unique(df$hek$id), function(x) mean(df$hek$counts[df$hek$id==x]))
names(n.means) <- unique(df$hek$id)
n.means## spliced unspliced ctrl.spliced ctrl.unspliced
## 23.01278 23.33227 11.72396 11.86510
n.medians <- sapply(unique(df$hek$id), function(x) median(df$hek$counts[df$hek$id==x]))
names(n.medians) <- unique(df$hek$id)
n.medians## spliced unspliced ctrl.spliced ctrl.unspliced
## 23 24 12 12
## check what ratio of treatment miRNA are in the TS target list (logit)
ggplot(df$hek, aes(log(ratios/(1 - ratios)))) + geom_density(aes(fill=id), alpha=0.5, na.rm=TRUE) + facet_wrap(~readtype)## check what ratio of treatment miRNA are in the TS target list
ggplot(df$hek, aes(ratios)) + geom_density(aes(fill=id), alpha=0.6, na.rm=TRUE) + facet_wrap(~readtype)## spliced unspliced ctrl.spliced ctrl.unspliced
## 0.12986543 0.12855534 0.06738572 0.06554005
## spliced unspliced ctrl.spliced ctrl.unspliced
## 0.1212121 0.1176471 0.0000000 0.0000000
Summary: As expected, there seem to be genes that are similarly upregulated over a majority of the treatment miRNA. They thus skew the control samples we created (average counts over all treatments for each gene). To eliminate this distortion we exclude these genes from the datasets.
# exclude skewing genes
se.hela.batch <- se.hela.batch[!(rowData(se.hela.batch)$gene_name %in% names(ca.hela$n$spliced)),]
se.hek.batch <- se.hek.batch[!(rowData(se.hek.batch)$gene_name %in% names(ca.hek$n$spliced)),]